import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from wordcloud import WordCloud
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
import re
import statsmodels.api as sm
from pylab import *
# Load the NYC Airbnb 2019 listings dataset and preview the first rows.
homestay = pd.read_csv('AB_NYC_2019.csv')
homestay.head()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
# Row count of the raw dataset (48,895 listings per the recorded output).
len(homestay) # check amount of rows
48895
# Column dtypes; note 'last_review' is a plain object (string) column, not datetime.
homestay.dtypes # check types of the columns
id int64 name object host_id int64 host_name object neighbourhood_group object neighbourhood object latitude float64 longitude float64 room_type object price int64 minimum_nights int64 number_of_reviews int64 last_review object reviews_per_month float64 calculated_host_listings_count int64 availability_365 int64 dtype: object
# Drop 'id' (arbitrary key), 'host_name' (privacy/ethics, and unused), and
# 'last_review' (string date, unused below).
homestay.drop(['id','host_name','last_review'], axis=1, inplace=True) # Drop the 'host_name' not only because it is insignificant for analysis but also for ethical reasons.
homestay.isnull().sum()# check missing values
name 16 host_id 0 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 reviews_per_month 10052 calculated_host_listings_count 0 availability_365 0 dtype: int64
# A listing with zero reviews has no 'reviews_per_month', so 0 is the natural fill.
homestay.fillna({'reviews_per_month':0},inplace=True) # If there were no reviews for the listing, review_per_month" simply will not exist. So, we can simply append it with 0.0 for missing values.
homestay.isnull().sum() # recheck missing values
name 16 host_id 0 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 reviews_per_month 0 calculated_host_listings_count 0 availability_365 0 dtype: int64
# No fully duplicated rows (output below is 0).
homestay.duplicated().sum()
0
# Summary statistics of price; the 0–10000 range signals outliers to trim below.
homestay["price"].describe()
# The numerical distribution of the price reveals a mean value of 153.1. However, the price range extends from 0 to 10000, indicating the presence of some outliers.
count 48895.000000 mean 152.720687 std 240.154170 min 0.000000 25% 69.000000 50% 106.000000 75% 175.000000 max 10000.000000 Name: price, dtype: float64
homestay['price'].hist() # most of them are less than 1000
homestay["price"][homestay["price"]<400].hist()
# Trim extreme prices. NOTE(review): the hist above uses <400 while the filter
# keeps <=400, so listings priced exactly 400 are retained — confirm intended.
homestay = homestay[homestay["price"]<=400]
homestay["price"].describe()
count 47132.000000 mean 126.405202 std 78.172200 min 0.000000 25% 67.000000 50% 100.000000 75% 165.000000 max 400.000000 Name: price, dtype: float64
# minimum_nights also has extreme values (max 1250 per the output below).
homestay['minimum_nights'].describe()
count 47132.000000 mean 6.974391 std 20.343423 min 1.000000 25% 1.000000 50% 2.000000 75% 5.000000 max 1250.000000 Name: minimum_nights, dtype: float64
# Raw distribution — dominated by the long tail.
homestay['minimum_nights'].hist()
<AxesSubplot: >
# Zoom into values below 100 nights for a readable histogram.
homestay['minimum_nights'][homestay['minimum_nights']<100].hist()
<AxesSubplot: >
# Winsorize: cap minimum_nights at 30 rather than dropping those rows.
homestay.loc[(homestay.minimum_nights >30),'minimum_nights'] = 30
homestay['minimum_nights'][homestay['minimum_nights']<30].hist() # better distribution
<AxesSubplot: >
# Frequency tables for every non-numeric column (NaNs counted too).
for column in homestay.select_dtypes(include = object):
    print(column,'counts:\n',homestay[column].value_counts(dropna = False))
    print('\n\n')
name counts:
Hillside Hotel 18
Home away from home 17
NaN 16
New york Multi-unit building 14
Brooklyn Apartment 12
..
Unique 2BR Apartment 1
STUNNING ONE BEDROOM IN THE HEART OF NEW YORK CITY 1
One bedroom in Beautiful Astoria with balcony! 1
Elegantly designed 1bd room apt 1
Trendy duplex in the very heart of Hell's Kitchen 1
Name: name, Length: 46177, dtype: int64
neighbourhood_group counts:
Manhattan 20366
Brooklyn 19712
Queens 5612
Bronx 1077
Staten Island 365
Name: neighbourhood_group, dtype: int64
neighbourhood counts:
Williamsburg 3823
Bedford-Stuyvesant 3667
Harlem 2617
Bushwick 2454
Upper West Side 1867
...
Silver Lake 2
Richmondtown 1
New Dorp 1
Rossville 1
Willowbrook 1
Name: neighbourhood, Length: 219, dtype: int64
room_type counts:
Entire home/apt 23874
Private room 22110
Shared room 1148
Name: room_type, dtype: int64
# Bar chart of the 10 hosts with the most listings, annotated with counts.
top_host=homestay.host_id.value_counts().head(10)
top_host_df=pd.DataFrame(top_host)
top_host_df.reset_index(inplace=True)
top_host_df.rename(columns={'index':'Host_ID', 'host_id':'Count'}, inplace=True)
top_host_df = top_host_df.sort_values(by='Count', ascending=False) # count descending rank
plt.figure(figsize=(14, 10))
viz_1 = sns.barplot(x="Host_ID", y="Count", data=top_host_df, order=top_host_df.sort_values('Count', ascending=False).Host_ID, palette='Blues_d')
viz_1.set_title('Hosts with the most listings in NYC',fontsize = 20)
viz_1.set_ylabel('Count of listings',fontsize = 14)
viz_1.set_xlabel('Host IDs')
viz_1.set_xticklabels(viz_1.get_xticklabels(), rotation=45)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Write each bar's height just above the bar.
for p in viz_1.patches:
    viz_1.annotate(format(p.get_height(), '.0f'),
                   (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha = 'center', va = 'center',
                   xytext = (0, 5),
                   textcoords = 'offset points')
plt.show()
# Share of listings per borough as a pie chart.
neighbourhood_counts = homestay['neighbourhood_group'].value_counts()
plt.figure(figsize=(8, 6))
neighbourhood_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title("Neighbourhood Group Counts")
plt.show()
# Per-borough price summary (min / 25% / 50% / 75% / max), assembled into one
# table with a column per neighbourhood_group.
# Replaces five copy-pasted sub_N / price_subN blocks with a single loop —
# same output table, no repeated code.
nei_list = ['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx']
p_l_b_n_2 = []
for borough in nei_list:
    stats = homestay.loc[homestay['neighbourhood_group'] == borough, ['price']].describe(percentiles=[.25, .50, .75])
    stats = stats.iloc[3:]  # drop count/mean/std rows; keep min..max
    stats.reset_index(inplace=True)
    stats.rename(columns={'index': 'Stats', 'price': borough}, inplace=True)
    p_l_b_n_2.append(stats)
# Join the per-borough frames side by side on the 'Stats' index.
stat_df = [df.set_index('Stats') for df in p_l_b_n_2]
stat_df = stat_df[0].join(stat_df[1:])
stat_df
| Brooklyn | Manhattan | Queens | Staten Island | Bronx | |
|---|---|---|---|---|---|
| Stats | |||||
| min | 0.0 | 0.0 | 10.0 | 13.0 | 0.0 |
| 25% | 60.0 | 90.0 | 50.0 | 50.0 | 45.0 |
| 50% | 90.0 | 140.0 | 75.0 | 75.0 | 65.0 |
| 75% | 145.0 | 200.0 | 110.0 | 105.0 | 98.0 |
| max | 400.0 | 400.0 | 400.0 | 300.0 | 399.0 |
# using violinplot to showcase density and distribution of prices
# NOTE(review): prices were already capped at <=400 above, so this <500 filter
# is effectively a no-op kept for safety.
homestay0 = homestay[homestay.price < 500]
viz_2=sns.violinplot(data=homestay0, x='neighbourhood_group', y='price')
viz_2.set_title('Density and distribution of prices for each neighbourhood_group')
# Share of listings per room type as a pie chart.
room_type_counts = homestay['room_type'].value_counts()
plt.figure(figsize=(8, 6))
room_type_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title("Room Type Counts")
plt.show()
# using violinplot to showcase density and distribution of prices per room type
viz_2=sns.violinplot(data=homestay0, x='room_type', y='price')
viz_2.set_title('Density and distribution of prices for each room type')
# Ten most common neighbourhoods, used to build sub_7 below.
homestay.neighbourhood.value_counts().head(10)
Williamsburg 3823 Bedford-Stuyvesant 3667 Harlem 2617 Bushwick 2454 Upper West Side 1867 Hell's Kitchen 1834 East Village 1774 Upper East Side 1724 Crown Heights 1538 Midtown 1301 Name: neighbourhood, dtype: int64
# Listings in the 10 busiest neighbourhoods, split by room type (one panel each).
sub_7 = homestay.loc[homestay['neighbourhood'].isin(['Williamsburg', 'Bedford-Stuyvesant', 'Harlem', 'Bushwick', 'Upper West Side', 'Hell\'s Kitchen', 'East Village', 'Upper East Side', 'Crown Heights', 'Midtown'])]
viz_3 = sns.catplot(x='neighbourhood', hue='neighbourhood_group', col='room_type', data=sub_7, kind='count')
viz_3.set_xticklabels(rotation=70)
plt.show()
# NOTE(review): creates an empty figure that nothing draws into (the wordcloud
# cell below opens its own figure) — appears to be leftover.
plt.figure(figsize=(20, 10))
<Figure size 2000x1000 with 0 Axes>
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Filter non-strings (NaN names). NOTE(review): str(name) is redundant here —
# isinstance already guarantees each kept value is a str.
name_list = [str(name) for name in homestay.name if isinstance(name, str)]
plt.subplots(figsize=(25,15))
# Word cloud of all listing names concatenated into one corpus.
wordcloud = WordCloud(
    background_color='white',
    width=2000,
    height=1000
).generate(" ".join(name_list))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Drop identifiers/coordinates, one-hot encode the categorical columns, and
# plot a Kendall correlation heatmap of the numeric features.
homestay.drop(['host_id',"latitude",'longitude'], axis=1, inplace=True)
homestay1 = pd.get_dummies(homestay,columns = ['neighbourhood_group','room_type'],drop_first=True)
homestay1.drop(["neighbourhood"], axis=1, inplace=True)
corr = homestay1.corr(method='kendall', numeric_only=True)
plt.figure(figsize = (15, 15))
ax = sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns, linewidth = 0.2, cmap = 'YlGnBu', annot = True, annot_kws={"fontsize":14}) ## annot: print the value in each cell; annot_kws: annotation font size
ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize = 20)
ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize = 20)
plt.title('Heatmap of Correlation Coefficient',fontsize = 25)
plt.show()
# Work on a copy so homestay1 keeps its NaN names for the regression section.
homestay2 = homestay1.copy()
# BUG FIX: `homestay2['name'].fillna('', inplace=True)` is chained assignment on
# a column view — deprecated in pandas 2.x and a silent no-op under
# copy-on-write (pandas 3.0). Assign the filled column back instead.
homestay2['name'] = homestay2['name'].fillna('')
homestay2['name'].isnull().sum()
0
def remove_punctuation_digits_specialchar(line):
    """Collapse every run of non-alphabetic characters in *line* to a single
    space, then lowercase the result."""
    letters_only = re.sub('[^A-Za-z]+', ' ', line)
    return letters_only.lower()
# Normalize every listing name into 'clean_name' and spot-check the tail.
homestay2['clean_name'] = homestay2['name'].apply(remove_punctuation_digits_specialchar)
homestay2[['name', 'clean_name']].tail()
| name | clean_name | |
|---|---|---|
| 48890 | Charming one bedroom - newly renovated rowhouse | charming one bedroom newly renovated rowhouse |
| 48891 | Affordable room in Bushwick/East Williamsburg | affordable room in bushwick east williamsburg |
| 48892 | Sunny Studio at Historical Neighborhood | sunny studio at historical neighborhood |
| 48893 | 43rd St. Time Square-cozy single bed | rd st time square cozy single bed |
| 48894 | Trendy duplex in the very heart of Hell's Kitchen | trendy duplex in the very heart of hell s kitchen |
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords

# PERFORMANCE FIX: stopwords.words('english') returns a fresh list; the
# original rebuilt it (and did O(len(list)) membership tests) for every token
# of every one of ~47k listing names. Build a set once instead.
_ENGLISH_STOPWORDS = set(stopwords.words('english'))

def tokenize_no_stopwords(line):
    """Tokenize *line* and return the non-stopword tokens joined by spaces."""
    tokens = nltk.tokenize.word_tokenize(line)
    tokens_no_stop = [w for w in tokens if w not in _ENGLISH_STOPWORDS]
    return " ".join(tokens_no_stop)

homestay2['final_name'] = homestay2['clean_name'].apply(tokenize_no_stopwords)
homestay2[['name', 'clean_name', 'final_name']].head()
| name | clean_name | final_name | |
|---|---|---|---|
| 0 | Clean & quiet apt home by the park | clean quiet apt home by the park | clean quiet apt home park |
| 1 | Skylit Midtown Castle | skylit midtown castle | skylit midtown castle |
| 2 | THE VILLAGE OF HARLEM....NEW YORK ! | the village of harlem new york | village harlem new york |
| 3 | Cozy Entire Floor of Brownstone | cozy entire floor of brownstone | cozy entire floor brownstone |
| 4 | Entire Apt: Spacious Studio/Loft by central park | entire apt spacious studio loft by central park | entire apt spacious studio loft central park |
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score, classification_report, mean_absolute_error, r2_score
from imblearn.over_sampling import SMOTE
def classify_price_category(price):
    """Bucket a nightly price into an ordinal class.

    Returns 2 for price > 300 ("high"), 1 for 100 < price <= 300 ("medium"),
    and 0 otherwise ("low").
    """
    for threshold, label in ((300, 2), (100, 1)):
        if price > threshold:
            return label
    return 0
# Discretize price into 3 classes; the distribution is heavily imbalanced
# (class 2 is rare), which motivates SMOTE below.
homestay2['target'] = homestay2['price'].apply(classify_price_category)
homestay2['target'].value_counts()
# Stratified 90/10 split keeps the class ratios identical in train and test.
train, test = train_test_split(homestay2, test_size=0.1, random_state=4, stratify=homestay2['target'])
X_train, y_train = train['final_name'], train['target']
X_test, y_test = test['final_name'], test['target']
# TF-IDF is fitted on train only; test is transformed with the train vocabulary.
vect = TfidfVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)
# over-sampling: SMOTE is applied to the TRAINING set only, never to test.
smote = SMOTE(random_state=4)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
homestay2['target'].value_counts()
0 23928 1 21610 2 1594 Name: target, dtype: int64
# After SMOTE all three classes have equal counts (see output below).
y_train_resampled_value_counts = y_train_resampled.value_counts()
print("SMOTE applied target distribution: ", y_train_resampled_value_counts)
SMOTE applied target distribution: 1 21535 0 21535 2 21535 Name: target, dtype: int64
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
# Baseline text classifier: decision tree on the TF-IDF features,
# trained on the SMOTE-balanced set, evaluated on the untouched test set.
dt = DecisionTreeClassifier(random_state=4)
dt.fit(X_train_resampled, y_train_resampled)
preds = dt.predict(X_test)
# confusion matrix
cm = confusion_matrix(y_test, preds)
label_map = {'0': 'low', '1': 'medium', '2': 'high'}
cm_headers = [f"actual {label_map[str(i)]}" for i in range(cm.shape[0])]
cm_columns = [f"predicted {label_map[str(i)]}" for i in range(cm.shape[1])]
cm = pd.DataFrame(cm, columns=cm_columns, index=cm_headers)
print("Confusion matrix:")
print(cm.astype(int))
# classification report
cr = classification_report(y_test, preds, target_names=label_map.values(), digits=3)
print("\nClassification report:")
print(cr)
Confusion matrix:
predicted low predicted medium predicted high
actual low 1655 700 38
actual medium 651 1365 145
actual high 31 94 35
Classification report:
precision recall f1-score support
low 0.708 0.692 0.700 2393
medium 0.632 0.632 0.632 2161
high 0.161 0.219 0.185 160
accuracy 0.648 4714
macro avg 0.500 0.514 0.506 4714
weighted avg 0.655 0.648 0.651 4714
# Same evaluation with LightGBM.
# NOTE(review): the variable is named `lr` but holds an LGBMClassifier,
# not a logistic/linear model — rename candidate.
lr = LGBMClassifier(random_state=4)
lr.fit(X_train_resampled, y_train_resampled)
preds = lr.predict(X_test)
# confusion matrix
cm = confusion_matrix(y_test, preds)
label_map = {'0': 'low', '1': 'medium', '2': 'high'}
cm_headers = [f"actual {label_map[str(i)]}" for i in range(cm.shape[0])]
cm_columns = [f"predicted {label_map[str(i)]}" for i in range(cm.shape[1])]
cm = pd.DataFrame(cm, columns=cm_columns, index=cm_headers)
print("Confusion matrix:")
print(cm.astype(int))
# classification report
cr = classification_report(y_test, preds, target_names=label_map.values(), digits=3)
print("\nClassification report:")
print(cr)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59554
[LightGBM] [Info] Number of data points in the train set: 64605, number of used features: 960
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Confusion matrix:
predicted low predicted medium predicted high
actual low 1754 607 32
actual medium 551 1491 119
actual high 26 94 40
Classification report:
precision recall f1-score support
low 0.752 0.733 0.743 2393
medium 0.680 0.690 0.685 2161
high 0.209 0.250 0.228 160
accuracy 0.697 4714
macro avg 0.547 0.558 0.552 4714
weighted avg 0.701 0.697 0.699 4714
# Price regression on the one-hot features (neighbourhood column was dropped
# earlier, so only borough + room type dummies plus numeric columns remain).
homestay1.drop(['name'], axis=1, inplace=True)
X = homestay1.loc[:,homestay1.columns != 'price']
y = homestay1['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)
# NOTE(review): R² is computed on the TRAINING set while RMSE is on the test
# set — the two metrics are not directly comparable.
print('R² = ',linreg.score(X_train, y_train).round(3))
print ('RMSE = %.3f'%np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
R² = 0.438 RMSE = 60.241
# Difference between test and pred
plt.figure(figsize = (40,20))
plt.plot(range(len(y_pred)),y_pred-y_test,'lightpink')
# Zero reference line, sized from the predictions instead of a hard-coded 4714.
plt.plot(range(len(y_pred)),np.zeros(len(y_pred)),'black')
# BUG FIX: `plt.ylim = (-300,200)` rebound the pyplot function to a tuple
# (setting nothing and breaking later plt.ylim(...) calls); call it instead.
plt.ylim(-300, 200)
plt.title('Difference Between Test and Pred of Linear Regression',fontsize = 50)
plt.xlabel('No. of test',fontsize = 50)
plt.ylabel('price_difference',fontsize = 50)
# estimated - actual price plot
test_pred = pd.DataFrame(y_test)
test_pred['esti_price'] = y_pred
test_pred.plot(x='price',y='esti_price',kind='scatter',fontsize = 10)
plt.title('test_pred_scatter_LR',fontsize = 20)
x = np.linspace(0,250)
y=x  # NOTE: rebinds the earlier target variable `y`; harmless since it is not reused
plt.plot(x,y,'r')  # 45-degree reference: perfect predictions fall on this line
# BUG FIX: same attribute-assignment mistake for the axis limits.
plt.xlim(0, 400)
plt.ylim(0, 500)
# Lasso on a richer feature set: here 'neighbourhood' IS one-hot encoded
# (unlike homestay1), giving ~229 features; L1 shrinks most to exactly 0.
homestay3 = homestay.copy()
homestay3.drop(['name'], axis=1, inplace=True)
homestay3 = pd.get_dummies(homestay3, columns=['neighbourhood_group', 'neighbourhood', 'room_type'], drop_first=True)
X1 = homestay3.loc[:, homestay3.columns != 'price']
y1 = homestay3['price']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.1, random_state=4)
Lassoreg = Lasso(alpha=0.01)
Lassoreg.fit(X_train1, y_train1)
# Coefficients sorted descending: positive = price premium vs the dropped
# baseline category, negative = discount; exact zeros were eliminated by L1.
coef_dict = dict(zip(X_train1.columns, Lassoreg.coef_.round(3)))
sorted(coef_dict.items(),key = lambda x:x[1],reverse = True)
[('neighbourhood_DUMBO', 75.086),
('neighbourhood_group_Manhattan', 64.846),
('neighbourhood_Tribeca', 56.408),
('neighbourhood_Vinegar Hill', 52.385),
('neighbourhood_NoHo', 37.058),
('neighbourhood_Boerum Hill', 36.569),
('neighbourhood_Downtown Brooklyn', 35.689),
('neighbourhood_Park Slope', 34.326),
('neighbourhood_Williamsburg', 33.877),
('neighbourhood_Cobble Hill', 33.682),
('neighbourhood_Midtown', 32.98),
('neighbourhood_Gowanus', 32.047),
('neighbourhood_Carroll Gardens', 29.884),
('neighbourhood_Brooklyn Heights', 29.879),
('neighbourhood_South Slope', 29.444),
('neighbourhood_Fort Greene', 28.481),
('neighbourhood_Flatiron District', 27.209),
('neighbourhood_Greenpoint', 25.317),
('neighbourhood_Prospect Heights', 24.843),
('neighbourhood_Theater District', 24.548),
('neighbourhood_West Village', 24.164),
('neighbourhood_Long Island City', 23.895),
('neighbourhood_Clinton Hill', 22.603),
('neighbourhood_Arverne', 21.059),
('neighbourhood_Greenwich Village', 20.044),
('neighbourhood_Chelsea', 19.712),
('neighbourhood_SoHo', 19.386),
('neighbourhood_Nolita', 18.854),
("neighbourhood_Hell's Kitchen", 13.83),
('neighbourhood_Windsor Terrace', 13.51),
('neighbourhood_group_Brooklyn', 13.026),
('neighbourhood_Red Hook', 9.032),
('neighbourhood_Breezy Point', 8.188),
('neighbourhood_group_Queens', 7.999),
('neighbourhood_Murray Hill', 7.202),
('neighbourhood_Columbia St', 7.011),
('neighbourhood_Astoria', 6.877),
('neighbourhood_Battery Park City', 6.643),
('neighbourhood_Gramercy', 5.376),
('neighbourhood_Financial District', 3.608),
('neighbourhood_Crown Heights', 3.023),
('neighbourhood_Elmhurst', 2.115),
('neighbourhood_Woodside', 1.374),
('neighbourhood_Ditmars Steinway', 1.03),
('neighbourhood_Bedford-Stuyvesant', 1.023),
('neighbourhood_Clason Point', 0.75),
('calculated_host_listings_count', 0.124),
('availability_365', 0.088),
('neighbourhood_Forest Hills', 0.057),
('neighbourhood_Arden Heights', -0.0),
('neighbourhood_Arrochar', -0.0),
('neighbourhood_Bath Beach', -0.0),
('neighbourhood_Bay Terrace', 0.0),
('neighbourhood_Bay Terrace, Staten Island', -0.0),
('neighbourhood_Baychester', -0.0),
('neighbourhood_Bayside', 0.0),
('neighbourhood_Bayswater', -0.0),
('neighbourhood_Belle Harbor', 0.0),
('neighbourhood_Bellerose', 0.0),
('neighbourhood_Belmont', 0.0),
('neighbourhood_Bergen Beach', -0.0),
('neighbourhood_Briarwood', 0.0),
("neighbourhood_Bull's Head", 0.0),
('neighbourhood_Bushwick', 0.0),
('neighbourhood_Cambria Heights', 0.0),
('neighbourhood_Castle Hill', -0.0),
('neighbourhood_Castleton Corners', 0.0),
('neighbourhood_City Island', -0.0),
('neighbourhood_Civic Center', 0.0),
('neighbourhood_Claremont Village', 0.0),
('neighbourhood_Clifton', -0.0),
('neighbourhood_Co-op City', 0.0),
('neighbourhood_College Point', -0.0),
('neighbourhood_Concord', -0.0),
('neighbourhood_Concourse', 0.0),
('neighbourhood_Concourse Village', -0.0),
('neighbourhood_Coney Island', 0.0),
('neighbourhood_Dongan Hills', -0.0),
('neighbourhood_Douglaston', -0.0),
('neighbourhood_Dyker Heights', -0.0),
('neighbourhood_East Elmhurst', -0.0),
('neighbourhood_East Morrisania', -0.0),
('neighbourhood_East Village', -0.0),
('neighbourhood_Eastchester', 0.0),
('neighbourhood_Edenwald', -0.0),
('neighbourhood_Edgemere', 0.0),
('neighbourhood_Eltingville', 0.0),
('neighbourhood_Emerson Hill', -0.0),
('neighbourhood_Far Rockaway', -0.0),
('neighbourhood_Fieldston', -0.0),
('neighbourhood_Flushing', 0.0),
('neighbourhood_Fordham', 0.0),
('neighbourhood_Fresh Meadows', 0.0),
('neighbourhood_Glendale', -0.0),
('neighbourhood_Graniteville', -0.0),
('neighbourhood_Grant City', -0.0),
('neighbourhood_Great Kills', 0.0),
('neighbourhood_Grymes Hill', 0.0),
('neighbourhood_Highbridge', -0.0),
('neighbourhood_Hollis', 0.0),
('neighbourhood_Holliswood', 0.0),
('neighbourhood_Howard Beach', 0.0),
('neighbourhood_Howland Hook', -0.0),
('neighbourhood_Huguenot', -0.0),
('neighbourhood_Hunts Point', -0.0),
('neighbourhood_Jamaica Estates', 0.0),
('neighbourhood_Jamaica Hills', 0.0),
('neighbourhood_Kew Gardens', 0.0),
('neighbourhood_Kew Gardens Hills', 0.0),
('neighbourhood_Kingsbridge', 0.0),
('neighbourhood_Laurelton', -0.0),
('neighbourhood_Lighthouse Hill', 0.0),
('neighbourhood_Little Italy', 0.0),
('neighbourhood_Little Neck', 0.0),
('neighbourhood_Longwood', 0.0),
('neighbourhood_Manhattan Beach', -0.0),
('neighbourhood_Mariners Harbor', 0.0),
('neighbourhood_Melrose', -0.0),
('neighbourhood_Middle Village', -0.0),
('neighbourhood_Midland Beach', -0.0),
('neighbourhood_Mill Basin', -0.0),
('neighbourhood_Morris Heights', 0.0),
('neighbourhood_Morris Park', -0.0),
('neighbourhood_Morrisania', 0.0),
('neighbourhood_Mott Haven', -0.0),
('neighbourhood_Mount Eden', -0.0),
('neighbourhood_Mount Hope', -0.0),
('neighbourhood_Navy Yard', 0.0),
('neighbourhood_Neponsit', 0.0),
('neighbourhood_New Brighton', 0.0),
('neighbourhood_New Dorp', -0.0),
('neighbourhood_New Dorp Beach', -0.0),
('neighbourhood_New Springville', -0.0),
('neighbourhood_North Riverdale', 0.0),
('neighbourhood_Norwood', 0.0),
('neighbourhood_Oakwood', -0.0),
('neighbourhood_Olinville', 0.0),
('neighbourhood_Parkchester', 0.0),
('neighbourhood_Pelham Bay', -0.0),
('neighbourhood_Port Morris', 0.0),
('neighbourhood_Port Richmond', 0.0),
("neighbourhood_Prince's Bay", 0.0),
('neighbourhood_Randall Manor', -0.0),
('neighbourhood_Rego Park', -0.0),
('neighbourhood_Richmondtown', -0.0),
('neighbourhood_Riverdale', 0.0),
('neighbourhood_Rockaway Beach', 0.0),
('neighbourhood_Rosebank', -0.0),
('neighbourhood_Rossville', -0.0),
('neighbourhood_Schuylerville', 0.0),
('neighbourhood_Sea Gate', 0.0),
('neighbourhood_Shore Acres', 0.0),
('neighbourhood_Silver Lake', 0.0),
('neighbourhood_Soundview', -0.0),
('neighbourhood_South Beach', 0.0),
('neighbourhood_South Ozone Park', -0.0),
('neighbourhood_Springfield Gardens', 0.0),
('neighbourhood_Spuyten Duyvil', 0.0),
('neighbourhood_St. Albans', 0.0),
('neighbourhood_St. George', 0.0),
('neighbourhood_Stapleton', 0.0),
('neighbourhood_Stuyvesant Town', -0.0),
('neighbourhood_Sunnyside', -0.0),
('neighbourhood_Throgs Neck', 0.0),
('neighbourhood_Todt Hill', 0.0),
('neighbourhood_Tompkinsville', -0.0),
('neighbourhood_Tottenville', 0.0),
('neighbourhood_Tremont', -0.0),
('neighbourhood_Unionport', -0.0),
('neighbourhood_University Heights', 0.0),
('neighbourhood_Van Nest', 0.0),
('neighbourhood_Wakefield', 0.0),
('neighbourhood_West Brighton', 0.0),
('neighbourhood_West Farms', 0.0),
('neighbourhood_Westchester Square', -0.0),
('neighbourhood_Westerleigh', -0.0),
('neighbourhood_Whitestone', 0.0),
('neighbourhood_Williamsbridge', -0.0),
('neighbourhood_Willowbrook', 0.0),
('neighbourhood_Woodlawn', -0.0),
('number_of_reviews', -0.085),
('neighbourhood_Richmond Hill', -0.11),
('neighbourhood_Sunset Park', -0.277),
('neighbourhood_Jackson Heights', -0.446),
('neighbourhood_Jamaica', -0.629),
('neighbourhood_Upper West Side', -0.67),
('neighbourhood_Ridgewood', -0.697),
('neighbourhood_group_Staten Island', -0.965),
('reviews_per_month', -1.219),
('minimum_nights', -1.245),
('neighbourhood_Rosedale', -2.026),
('neighbourhood_Gravesend', -2.042),
('neighbourhood_Bronxdale', -2.558),
('neighbourhood_Kensington', -2.774),
('neighbourhood_Bay Ridge', -2.89),
('neighbourhood_Fort Hamilton', -3.278),
('neighbourhood_Lower East Side', -3.599),
('neighbourhood_Kips Bay', -4.119),
('neighbourhood_Prospect-Lefferts Gardens', -4.589),
('neighbourhood_Brighton Beach', -4.689),
('neighbourhood_Brownsville', -5.238),
('neighbourhood_Maspeth', -5.756),
('neighbourhood_Sheepshead Bay', -6.12),
('neighbourhood_Two Bridges', -6.6),
('neighbourhood_East Flatbush', -7.486),
('neighbourhood_Chinatown', -7.671),
('neighbourhood_Cypress Hills', -7.989),
('neighbourhood_Pelham Gardens', -8.962),
('neighbourhood_Flatlands', -9.554),
('neighbourhood_Flatbush', -10.258),
('neighbourhood_Upper East Side', -10.938),
('neighbourhood_Woodhaven', -11.613),
('neighbourhood_Bensonhurst', -11.837),
('neighbourhood_Midwood', -12.191),
('neighbourhood_Corona', -12.343),
('neighbourhood_Canarsie', -13.17),
('neighbourhood_Queens Village', -13.415),
('neighbourhood_Ozone Park', -13.815),
('neighbourhood_East New York', -15.265),
('neighbourhood_Borough Park', -16.088),
('neighbourhood_Marble Hill', -22.158),
('neighbourhood_Roosevelt Island', -26.258),
('neighbourhood_East Harlem', -27.841),
('neighbourhood_Morningside Heights', -34.252),
('neighbourhood_Harlem', -35.712),
('neighbourhood_Washington Heights', -51.604),
('neighbourhood_Inwood', -55.222),
('room_type_Private room', -80.018),
('room_type_Shared room', -103.583)]
# Sanity check: one coefficient per feature column (229).
number_of_key_value_pairs = len(coef_dict)
print(f"The dictionary has {number_of_key_value_pairs} key-value pairs.")
The dictionary has 229 key-value pairs.
# NOTE(review): R² is on the training set, RMSE on the test set (same caveat
# as the linear-regression cell).
print('R² = ',Lassoreg.score(X_train1, y_train1).round(3))
y_pred1= Lassoreg.predict(X_test1)
print ('RMSE1 =%.3f'%np.sqrt(metrics.mean_squared_error(y_test1, y_pred1))) # Root Mean Square Error
R² = 0.501 RMSE1 =56.840
# Difference between test and pred
plt.figure(figsize = (40,20))
plt.plot(range(len(y_pred1)),y_pred1-y_test1,'lightpink')
# Zero reference line, sized from the predictions instead of a hard-coded 4714.
plt.plot(range(len(y_pred1)),np.zeros(len(y_pred1)),'black')
# BUG FIX: `plt.ylim = (-300,200)` rebound the pyplot function to a tuple
# instead of setting the limits; call it.
plt.ylim(-300, 200)
plt.title('Difference Between Test and Pred of Lasso Regression',fontsize = 50)
plt.xlabel('No. of test',fontsize = 50)
plt.ylabel('price_difference',fontsize = 50)
# estimated - actual price scatter plot
y_pred1 = Lassoreg.predict(X_test1)
test_pred1 = pd.DataFrame(y_test1)
test_pred1['esti_price1'] = y_pred1
test_pred1.plot(x='price',y='esti_price1',kind='scatter',fontsize = 10)
plt.title('test_pred_scatter_Lasso',fontsize = 20)
x = np.linspace(0,250)
y=x
plt.plot(x,y,'r')  # 45-degree reference line
# distributed closer to 45° line
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import random
random.seed(4)
np.random.seed(4)
# 10-fold CV over cost-complexity pruning strengths for a regression tree.
param_grid = {"ccp_alpha": np.linspace(0, 1, 11)}
reg_tree = DecisionTreeRegressor()
# Pruning
grid_search = GridSearchCV(reg_tree, param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
grid_search.fit(X_train1, y_train1)
print("Scores for each parameter combination:")
means = -grid_search.cv_results_['mean_test_score']  # negate neg-MSE back to MSE
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("CCP Alpha: %0.3f, RMSE: %0.3f (+/- %0.3f)" % (params['ccp_alpha'], np.sqrt(mean), std))
# NOTE(review): the best alpha (1.0) sits at the edge of the search grid —
# a wider range might find a better value.
best_alpha = grid_search.best_params_['ccp_alpha']
print("best alpha: ", best_alpha)
Scores for each parameter combination: CCP Alpha: 0.000, RMSE: 71.641 (+/- 144.632) CCP Alpha: 0.100, RMSE: 68.462 (+/- 142.786) CCP Alpha: 0.200, RMSE: 66.100 (+/- 156.040) CCP Alpha: 0.300, RMSE: 63.734 (+/- 152.811) CCP Alpha: 0.400, RMSE: 61.534 (+/- 157.203) CCP Alpha: 0.500, RMSE: 59.850 (+/- 151.028) CCP Alpha: 0.600, RMSE: 58.685 (+/- 157.464) CCP Alpha: 0.700, RMSE: 57.309 (+/- 131.820) CCP Alpha: 0.800, RMSE: 56.392 (+/- 122.917) CCP Alpha: 0.900, RMSE: 55.851 (+/- 110.634) CCP Alpha: 1.000, RMSE: 55.752 (+/- 102.677) best alpha: 1.0
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
# Fit using the best alpha
feature_names = list(X_train1.columns)
best_reg_tree = DecisionTreeRegressor(ccp_alpha=best_alpha, random_state=4)
best_reg_tree.fit(X_train1, y_train1)
plt.figure(figsize=(40, 20))
# set max_depth = 3 so only the top of the tree is drawn.
# NOTE(review): class_names has no effect for a regressor — verify intent.
plot_tree(best_reg_tree, filled=True, feature_names=feature_names, class_names=['price'],
          rounded=True, precision=2, impurity=True, node_ids=True, proportion=False,
          fontsize=18, label='all', max_depth=3)
# Test RMSE and (training-set) R² for the pruned tree.
y_pred1 = best_reg_tree.predict(X_test1)
mse = mean_squared_error(y_test1, y_pred1)
rmse = np.sqrt(mse)
print("RMSE = ", round(rmse, 3))
print('R² = ', best_reg_tree.score(X_train1, y_train1).round(3))
RMSE = 56.828 R² = 0.568
# Baseline random forest with default hyperparameters.
# NOTE(review): no random_state — results are not exactly reproducible.
regrf = RandomForestRegressor(n_estimators=300)
regrf.fit(X_train1, y_train1)
RandomForestRegressor(n_estimators=300)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor(n_estimators=300)
# Feature importance
importances = regrf.feature_importances_
weights = pd.Series(importances,
                    index=X_train1.columns.values)
print(weights.sort_values(ascending = False).round(3))
plt.figure(figsize=(15,10))
plt.title('Feature Importances',fontsize = 20)
# Horizontal bars for the top-10 most important features.
weights.sort_values()[-10:].plot(kind = 'barh')
plt.xlabel('Relative Importance',fontsize = 15)
room_type_Private room 0.316
availability_365 0.111
reviews_per_month 0.105
number_of_reviews 0.081
minimum_nights 0.070
...
neighbourhood_Richmondtown 0.000
neighbourhood_Westerleigh 0.000
neighbourhood_Rossville 0.000
neighbourhood_Co-op City 0.000
neighbourhood_Silver Lake 0.000
Length: 229, dtype: float64
Text(0.5, 0, 'Relative Importance')
print('R² = ',regrf.score(X_train1, y_train1).round(3))
y_predrf= regrf.predict(X_test1)
print('RMSE = ',np.sqrt(metrics.mean_squared_error(y_test1,y_predrf)).round(3))
# Training-set R² of 89.6% against a test RMSE barely better than Lasso
# suggests the forest is overfitting the training data.
R² = 0.896 RMSE = 55.668
# Difference between test and pred
# BUG FIX: use y_test1 (the homestay3 split that regrf was actually fitted
# on), not the stale y_test left over from the homestay1 linear-regression
# split — mixing the two pairs predictions with the wrong target frame.
test_predrf = pd.DataFrame(y_test1)
test_predrf['esti_price'] = y_predrf
plt.figure(figsize = (40,20))
plt.plot(range(len(y_predrf)),y_predrf-y_test1,'lightpink')
# Zero reference line, sized from the predictions instead of a hard-coded 4714.
plt.plot(range(len(y_predrf)),np.zeros(len(y_predrf)),'black')
# BUG FIX: `plt.ylim = (-300,200)` rebound the pyplot function; call it instead.
plt.ylim(-300, 200)
plt.title('Difference Between Test and Pred of Random Forest Regression before Selecting Parameters',fontsize = 50)
plt.xlabel('No. of test',fontsize = 50)
plt.ylabel('price_difference',fontsize = 50)
# estimated - actual price scatter plot
test_predrf.plot(x='price',y='esti_price',kind='scatter',fontsize = 10)
plt.title('test_pred_scatterrf',fontsize = 20)
x = np.linspace(0,250)
y=x
plt.plot(x,y,'r')  # 45-degree reference line
# Tuning Parameters - Random Search
# Step1: Creating a Hyperparameter Grid
from sklearn.model_selection import RandomizedSearchCV
# Candidate forest sizes: 200..1000 in five even steps.
n_estimators = [int(x) for x in np.linspace(start=200, stop=1000, num=5)]
# 1.0 means "consider all features" — the legacy 'auto' alias was removed in
# scikit-learn 1.3 and now raises an error for RandomForestRegressor.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=6)]
max_depth.append(None)  # None = grow trees until leaves are pure
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
rm_grid = {'n_estimators': n_estimators,
           'max_features': max_features,
           'max_depth': max_depth,
           'min_samples_split': min_samples_split,
           'min_samples_leaf': min_samples_leaf,
           'bootstrap': bootstrap}
print(rm_grid)
{'n_estimators': [200, 400, 600, 800, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 30, 50, 70, 90, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
# Steps 2 & 3: random search with 3-fold cross-validation to find the
# parameter combination with the highest score.
# NOTE: this search takes hours to run.
rf1 = RandomForestRegressor()
# 180 random draws from rm_grid, 3-fold CV each, using all CPU cores.
rf1_random = RandomizedSearchCV(estimator=rf1, param_distributions=rm_grid, n_iter=180, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf1_random.fit(X_train1, y_train1)
# was rf2_random (a NameError): the fitted search object is rf1_random
print(rf1_random.best_params_)  # best parameters
'# Step2&3: 3-fold cross-validation with random search to find the corresponding parameters with the highest score\n\n# It will take hours to carry out this validation\nrf1 = RandomForestRegressor()\n# 3-fold cross-validation\nrf1_random = RandomizedSearchCV(estimator = rf1, param_distributions = rm_grid, n_iter = 180, cv = 3, verbose=2, random_state=42, n_jobs = -1)\nrf1_random.fit(X_train1, y_train1)\nprint(rf2_random.best_params_) # best parametre'
# Train a random forest using the optimal parameters found by the random search.
regrf1 = RandomForestRegressor(n_estimators=400, max_depth = 110, min_samples_split = 10,min_samples_leaf =1,max_features = 'sqrt',bootstrap = True)
regrf1.fit(X_train1, y_train1)
RandomForestRegressor(max_depth=110, max_features='sqrt', min_samples_split=10,
n_estimators=400)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestRegressor(max_depth=110, max_features='sqrt', min_samples_split=10,
n_estimators=400)# R² decreased and risk of overfitting declined
# Training-set R² of the tuned forest.
train_r2_tuned = regrf1.score(X_train1, y_train1)
print('R² = ', train_r2_tuned.round(3))
R² = 0.737
# Recompute feature importances for the tuned forest.
importances1 = regrf1.feature_importances_
weights1 = pd.Series(data=importances1, index=X_train1.columns.values)
print(weights1.sort_values(ascending=False).round(3))

# Bar chart of the ten most important features after tuning.
plt.figure(figsize=(15, 10))
plt.title('Feature Importances1', fontsize=20)
top_ten1 = weights1.sort_values().tail(10)
top_ten1.plot(kind='barh')
plt.xlabel('Relative Importance1', fontsize=15)
room_type_Private room 0.322
availability_365 0.090
reviews_per_month 0.076
calculated_host_listings_count 0.063
number_of_reviews 0.063
...
neighbourhood_Oakwood 0.000
neighbourhood_Rossville 0.000
neighbourhood_Todt Hill 0.000
neighbourhood_Co-op City 0.000
neighbourhood_Silver Lake 0.000
Length: 229, dtype: float64
Text(0.5, 0, 'Relative Importance1')
# Hold-out RMSE for the tuned forest.
y_predrf1 = regrf1.predict(X_test1)
mse_tuned = metrics.mean_squared_error(y_test1, y_predrf1)
print('RMSE = ', np.sqrt(mse_tuned).round(3))
RMSE = 54.099
# Difference between actual and predicted test prices for the tuned forest.
plt.figure(figsize=(40, 20))
plt.plot(range(len(y_predrf1)), y_predrf1 - y_test1, 'lightpink')
# zero reference line; length follows the data instead of the hard-coded 4714
plt.plot(range(len(y_predrf1)), np.zeros(len(y_predrf1)), 'black')
# was `plt.ylim = (-300, 200)`, which rebinds the function to a tuple and sets
# no limits at all — ylim must be called
plt.ylim(-300, 200)
plt.title('Difference Between Test and Pred of Random Forest Regression after Selecting Parameters', fontsize=50)
plt.xlabel('No. of test', fontsize=50)
plt.ylabel('price_difference', fontsize=50)

# Estimated vs actual price scatter with the y = x reference line.
# was pd.DataFrame(y_test): predictions come from X_test1, so pair them with
# y_test1 — confirm y_test/y_test1 come from the same split upstream
test_predrf1 = pd.DataFrame(y_test1)
test_predrf1['esti_price'] = y_predrf1
test_predrf1.plot(x='price', y='esti_price', kind='scatter', fontsize=10)
plt.title('test_pred_scatterrf1', fontsize=20)
x = np.linspace(0, 250)
y = x
plt.plot(x, y, 'r')
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error, r2_score

# Baseline feed-forward network: two hidden ReLU layers of 50 units each.
n_features = X_train1.shape[1]  # derive the width instead of hard-coding 229
model = keras.Sequential([
    # explicit Input layer: passing input_shape to Dense is deprecated and
    # triggers the UserWarning visible in the original run's output
    keras.Input(shape=(n_features,)),
    layers.Dense(50, activation='relu'),
    layers.Dense(50, activation='relu'),
    layers.Dense(1)
])
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.001))
history = model.fit(X_train1, y_train1, validation_split=0.1, epochs=50, batch_size=64)
loss = model.evaluate(X_test1, y_test1)

# Hold-out predictions and metrics.
y_pred1 = model.predict(X_test1).flatten()
RMSE = np.sqrt(mean_squared_error(y_test1, y_pred1))
r2 = r2_score(y_test1, y_pred1)
print('RMSE = %.3f' % RMSE)
print('R² = %.3f' % r2)
C:\Users\Tommy\AppData\Roaming\Python\Python311\site-packages\keras\src\layers\core\dense.py:86: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Epoch 1/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 14270.0303 - val_loss: 5613.9502 Epoch 2/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 4309.5264 - val_loss: 3465.5352 Epoch 3/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3139.7681 - val_loss: 3279.2512 Epoch 4/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3019.5059 - val_loss: 3204.2534 Epoch 5/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3063.4666 - val_loss: 3174.3452 Epoch 6/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3002.1589 - val_loss: 3181.0767 Epoch 7/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2952.7341 - val_loss: 3184.8140 Epoch 8/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2867.6155 - val_loss: 3159.6880 Epoch 9/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2906.6179 - val_loss: 3325.4902 Epoch 10/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2967.6970 - val_loss: 3297.1353 Epoch 11/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2940.2827 - val_loss: 3313.5857 Epoch 12/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2933.9275 - val_loss: 3233.2585 Epoch 13/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2869.5593 - val_loss: 3183.4407 Epoch 14/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2917.6016 - val_loss: 3132.5483 Epoch 15/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2918.0779 - val_loss: 3145.6440 Epoch 16/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2947.0562 - val_loss: 3223.9087 Epoch 17/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2941.7246 - val_loss: 3189.1008 Epoch 18/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2862.6282 - val_loss: 3486.5457 Epoch 19/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2914.2515 - val_loss: 3173.9697 Epoch 20/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2920.6282 - val_loss: 3261.3159 Epoch 21/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2940.2378 - val_loss: 3168.7754 Epoch 22/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - 
loss: 2815.1086 - val_loss: 3176.0923 Epoch 23/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2807.4758 - val_loss: 3061.6084 Epoch 24/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2820.5669 - val_loss: 3159.4500 Epoch 25/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2801.5427 - val_loss: 3083.7334 Epoch 26/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2807.3088 - val_loss: 3272.0383 Epoch 27/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2834.4709 - val_loss: 3087.1653 Epoch 28/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2926.2922 - val_loss: 3067.8433 Epoch 29/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2761.8306 - val_loss: 3082.7502 Epoch 30/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2762.4680 - val_loss: 3258.2122 Epoch 31/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2799.4119 - val_loss: 3072.9504 Epoch 32/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2794.4707 - val_loss: 3064.9441 Epoch 33/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2804.5110 - val_loss: 3184.1548 Epoch 34/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2897.9736 - val_loss: 3063.5586 Epoch 35/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2755.7126 - val_loss: 3151.7876 Epoch 36/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2792.2480 - val_loss: 3129.2773 Epoch 37/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2785.3416 - val_loss: 3168.3606 Epoch 38/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2851.5212 - val_loss: 3106.2759 Epoch 39/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2820.0664 - val_loss: 3070.3213 Epoch 40/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2810.0220 - val_loss: 3077.5725 Epoch 41/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2842.0059 - val_loss: 3117.4302 Epoch 42/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2793.7520 - val_loss: 3098.4836 Epoch 43/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2829.1147 - val_loss: 3086.9243 Epoch 
44/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2764.5374 - val_loss: 3054.6953 Epoch 45/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2778.7917 - val_loss: 3192.8110 Epoch 46/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2779.3521 - val_loss: 3049.4778 Epoch 47/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2741.2744 - val_loss: 3154.0405 Epoch 48/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2784.9775 - val_loss: 3116.7498 Epoch 49/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2727.0957 - val_loss: 3049.5222 Epoch 50/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2732.6794 - val_loss: 3051.8562 148/148 ━━━━━━━━━━━━━━━━━━━━ 0s 998us/step - loss: 2932.3076 148/148 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step RMSE = 55.133 R² = 0.529
# Difference between actual and predicted test prices for the baseline network.
plt.figure(figsize=(40, 20))
plt.plot(range(len(y_pred1)), y_pred1 - y_test1, 'lightpink')
# zero reference line; length follows the data instead of the hard-coded 4714
plt.plot(range(len(y_pred1)), np.zeros(len(y_pred1)), 'black')
# was `plt.ylim = (-300, 200)`, which rebinds the function to a tuple and sets
# no limits at all — ylim must be called
plt.ylim(-300, 200)
# title typo fixed: "Neutral" -> "Neural"
plt.title('Difference Between Test and Pred of Neural Network', fontsize=50)
plt.xlabel('No. of test', fontsize=50)
plt.ylabel('price_difference', fontsize=50)

# Estimated vs actual price scatter with the y = x reference line.
# was pd.DataFrame(y_test): y_pred1 is computed from X_test1, so pair with
# y_test1 — confirm y_test/y_test1 come from the same split upstream
test_prednn = pd.DataFrame(y_test1)
test_prednn['esti_price'] = y_pred1
test_prednn.plot(x='price', y='esti_price', kind='scatter', fontsize=10)
plt.title('test_pred_scatternn', fontsize=20)
x = np.linspace(0, 250)
y = x
plt.plot(x, y, 'r')
# L1-regularized network: same architecture as the baseline, with an L1
# penalty on every layer's kernel to shrink unimportant weights.
l1_regularization_strength = 0.01
n_features_l1 = X_train1.shape[1]  # derive the width instead of hard-coding 229
model = keras.Sequential([
    # explicit Input layer: passing input_shape to Dense is deprecated and
    # triggers the UserWarning visible in the original run's output
    keras.Input(shape=(n_features_l1,)),
    layers.Dense(50, activation='relu',
                 kernel_regularizer=tf.keras.regularizers.l1(l1_regularization_strength)),
    layers.Dense(50, activation='relu',
                 kernel_regularizer=tf.keras.regularizers.l1(l1_regularization_strength)),
    layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l1(l1_regularization_strength))
])
# lower learning rate (1e-4) than the baseline, as in the original run
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.0001))
history = model.fit(X_train1, y_train1, validation_split=0.1, epochs=50, batch_size=64)
loss = model.evaluate(X_test1, y_test1)

# Hold-out predictions and metrics.
y_pred1 = model.predict(X_test1).flatten()
RMSE = np.sqrt(mean_squared_error(y_test1, y_pred1))
r2 = r2_score(y_test1, y_pred1)
print('RMSE = %.3f' % RMSE)
print('R² = %.3f' % r2)
Epoch 1/50
C:\Users\Tommy\AppData\Roaming\Python\Python311\site-packages\keras\src\layers\core\dense.py:86: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs)
597/597 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 20542.7969 - val_loss: 13567.7109 Epoch 2/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12837.7803 - val_loss: 12305.3818 Epoch 3/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 11504.5684 - val_loss: 10999.7314 Epoch 4/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 10191.1875 - val_loss: 10080.3906 Epoch 5/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 9504.1758 - val_loss: 9165.3750 Epoch 6/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 8650.1934 - val_loss: 8036.6064 Epoch 7/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 7337.7583 - val_loss: 6868.9292 Epoch 8/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 6265.2163 - val_loss: 5882.5054 Epoch 9/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 5396.6641 - val_loss: 5141.2520 Epoch 10/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 4748.8843 - val_loss: 4587.0337 Epoch 11/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 4213.9956 - val_loss: 4259.2466 Epoch 12/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3891.1885 - val_loss: 3911.1287 Epoch 13/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3635.6692 - val_loss: 3693.4150 Epoch 14/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3477.2400 - val_loss: 3545.7544 Epoch 15/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3279.8562 - val_loss: 3465.7449 Epoch 16/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3169.5723 - val_loss: 3388.4314 Epoch 17/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3141.9705 - val_loss: 3352.7188 Epoch 18/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3109.7812 - val_loss: 3415.8994 Epoch 19/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3110.6162 - val_loss: 3317.2009 Epoch 20/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3038.2485 - val_loss: 3306.9614 Epoch 21/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3085.8132 - val_loss: 3379.8074 Epoch 22/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - 
loss: 2989.7073 - val_loss: 3271.5540 Epoch 23/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3064.6990 - val_loss: 3263.1897 Epoch 24/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3064.9290 - val_loss: 3243.4165 Epoch 25/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3045.0242 - val_loss: 3247.7808 Epoch 26/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3024.4351 - val_loss: 3236.9470 Epoch 27/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2963.3347 - val_loss: 3232.5823 Epoch 28/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3030.8843 - val_loss: 3227.7363 Epoch 29/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2977.7139 - val_loss: 3252.9941 Epoch 30/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 2977.2830 - val_loss: 3257.7603 Epoch 31/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2924.6086 - val_loss: 3198.8594 Epoch 32/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2887.2996 - val_loss: 3202.9199 Epoch 33/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2913.2458 - val_loss: 3248.2896 Epoch 34/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2984.6938 - val_loss: 3234.5540 Epoch 35/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2901.1604 - val_loss: 3219.9175 Epoch 36/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2952.9514 - val_loss: 3185.1863 Epoch 37/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2967.8337 - val_loss: 3178.2781 Epoch 38/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2943.8218 - val_loss: 3179.9934 Epoch 39/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2896.6150 - val_loss: 3168.2979 Epoch 40/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2914.4419 - val_loss: 3169.5037 Epoch 41/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2969.8904 - val_loss: 3163.6829 Epoch 42/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2937.4021 - val_loss: 3169.7246 Epoch 43/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2891.6396 - val_loss: 3168.5266 Epoch 
44/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2886.7256 - val_loss: 3207.7920 Epoch 45/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2887.6250 - val_loss: 3164.1267 Epoch 46/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2905.2673 - val_loss: 3169.6577 Epoch 47/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2883.9983 - val_loss: 3159.8779 Epoch 48/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2906.4841 - val_loss: 3167.1526 Epoch 49/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2949.9744 - val_loss: 3164.5361 Epoch 50/50 597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2904.4216 - val_loss: 3165.9866 148/148 ━━━━━━━━━━━━━━━━━━━━ 0s 956us/step - loss: 3010.7915 148/148 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step RMSE = 55.586 R² = 0.521
# Difference between actual and predicted test prices (L1-regularized network).
plt.figure(figsize=(40, 20))
# was y_pred (a NameError): this section's predictions are stored in y_pred1
plt.plot(range(len(y_pred1)), y_pred1 - y_test1, 'lightpink')
# zero reference line; length follows the data instead of the hard-coded 4714
plt.plot(range(len(y_pred1)), np.zeros(len(y_pred1)), 'black')
# was `plt.ylim = (-300, 200)`, which rebinds the function to a tuple and sets
# no limits at all — ylim must be called
plt.ylim(-300, 200)
# title typo fixed: "Neutral" -> "Neural"
plt.title('Difference Between Test and Pred of L1 regularization Neural Network', fontsize=50)
plt.xlabel('No. of test', fontsize=50)
plt.ylabel('price_difference', fontsize=50)

# Estimated vs actual price scatter with the y = x reference line.
test_prednn1 = pd.DataFrame(y_test1)
test_prednn1['esti_price'] = y_pred1
test_prednn1.plot(x='price', y='esti_price', kind='scatter', fontsize=10)
plt.title('test_pred_scatternn1', fontsize=20)
x = np.linspace(0, 250)
y = x
plt.plot(x, y, 'r')